{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
    "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "you may not use this file except in compliance with the License.\n",
    "You may obtain a copy of the License at\n",
    "    http://www.apache.org/licenses/LICENSE-2.0\n",
    "Unless required by applicable law or agreed to in writing, software\n",
    "distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "See the License for the specific language governing permissions and\n",
    "limitations under the License."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline  \n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import gc\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "from transformers import *\n",
    "tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 7min 35s, sys: 39.7 s, total: 8min 15s\n",
      "Wall time: 9min 36s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_tokens</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101\\t6417\\t3410\\t3398\\t3184\\t1909\\t56910\\t1683...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101\\t14120\\t131\\t120\\t120\\t188\\t119\\t11170\\t12...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101\\t62342\\t10858\\t54439\\t19571\\t22480\\t7831\\t...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>101\\t58955\\t10898\\t103305\\t1901\\t16181\\t7168\\t...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>101\\t2435\\t5656\\t2594\\t8279\\t8623\\t1925\\t64126...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         text_tokens  id\n",
       "0  101\\t6417\\t3410\\t3398\\t3184\\t1909\\t56910\\t1683...   0\n",
       "1  101\\t14120\\t131\\t120\\t120\\t188\\t119\\t11170\\t12...   1\n",
       "2  101\\t62342\\t10858\\t54439\\t19571\\t22480\\t7831\\t...   2\n",
       "3  101\\t58955\\t10898\\t103305\\t1901\\t16181\\t7168\\t...   3\n",
       "4  101\\t2435\\t5656\\t2594\\t8279\\t8623\\t1925\\t64126...   4"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "df = pd.read_csv('../input/training.tsv', sep='\\x01', header=None, usecols=[0] )\n",
    "df.columns = ['text_tokens']\n",
    "df['id']   = np.arange( df.shape[0] )\n",
    "df['id']   = df['id'].astype(np.uint32)\n",
    "gc.collect()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 47.5 s, sys: 4.09 s, total: 51.6 s\n",
      "Wall time: 57.5 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_tokens</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101\\t47185\\t10157\\t100986\\t10343\\t55422\\t119\\t...</td>\n",
       "      <td>148075238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101\\t6006\\t5086\\t1939\\t7418\\t3601\\t6406\\t1913\\...</td>\n",
       "      <td>148075239</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101\\t56898\\t137\\t44851\\t10317\\t11490\\t10112\\t1...</td>\n",
       "      <td>148075240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>101\\t13497\\t10437\\t94005\\t11161\\t73632\\t11067\\...</td>\n",
       "      <td>148075241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>101\\t24781\\t10152\\t42041\\t38268\\t10301\\t10798\\...</td>\n",
       "      <td>148075242</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         text_tokens         id\n",
       "0  101\\t47185\\t10157\\t100986\\t10343\\t55422\\t119\\t...  148075238\n",
       "1  101\\t6006\\t5086\\t1939\\t7418\\t3601\\t6406\\t1913\\...  148075239\n",
       "2  101\\t56898\\t137\\t44851\\t10317\\t11490\\t10112\\t1...  148075240\n",
       "3  101\\t13497\\t10437\\t94005\\t11161\\t73632\\t11067\\...  148075241\n",
       "4  101\\t24781\\t10152\\t42041\\t38268\\t10301\\t10798\\...  148075242"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dv = pd.read_csv('../input/val.tsv', sep='\\x01', header=None, usecols=[0] )\n",
    "dv.columns = ['text_tokens']\n",
    "dv['id']   = np.arange( df.shape[0] , df.shape[0]+dv.shape[0] )\n",
    "dv['id']   = dv['id'].astype(np.uint32)\n",
    "gc.collect()\n",
    "dv.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 36.6 s, sys: 2.46 s, total: 39 s\n",
      "Wall time: 35.2 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_tokens</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101\\t3100\\t5477\\t3028\\t4348\\t1924\\t111806\\t186...</td>\n",
       "      <td>163202922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101\\t56898\\t137\\t36110\\t10400\\t168\\t64062\\t131...</td>\n",
       "      <td>163202923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101\\t56898\\t137\\t179\\t36816\\t10775\\t40546\\t513...</td>\n",
       "      <td>163202924</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>101\\t56898\\t137\\t22038\\t40663\\t12892\\t45389\\t1...</td>\n",
       "      <td>163202925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>101\\t56898\\t137\\t11699\\t10174\\t10738\\t37816\\t1...</td>\n",
       "      <td>163202926</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         text_tokens         id\n",
       "0  101\\t3100\\t5477\\t3028\\t4348\\t1924\\t111806\\t186...  163202922\n",
       "1  101\\t56898\\t137\\t36110\\t10400\\t168\\t64062\\t131...  163202923\n",
       "2  101\\t56898\\t137\\t179\\t36816\\t10775\\t40546\\t513...  163202924\n",
       "3  101\\t56898\\t137\\t22038\\t40663\\t12892\\t45389\\t1...  163202925\n",
       "4  101\\t56898\\t137\\t11699\\t10174\\t10738\\t37816\\t1...  163202926"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dt = pd.read_csv('../input/competition_test.tsv', sep='\\x01', header=None, usecols=[0] )\n",
    "dt.columns = ['text_tokens']\n",
    "dt['id']   = np.arange( df.shape[0]+dv.shape[0], df.shape[0]+dv.shape[0]+dt.shape[0] )\n",
    "dt['id']   = dt['id'].astype(np.uint32)\n",
    "gc.collect()\n",
    "dt.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "acd29b09856e4ea993c970d754e00703",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=148075238.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "CPU times: user 1h 46min 30s, sys: 1min 9s, total: 1h 47min 40s\n",
      "Wall time: 1h 46min 36s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "63"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "df['tweet'] = [ tokenizer.decode( [ int(n) for n in t.split('\\t') ] ) for t in tqdm(df.text_tokens.values) ] \n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "969bbe1c6c88418396cdedd1dc8ec9b1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=15127684.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "CPU times: user 10min 47s, sys: 6.55 s, total: 10min 54s\n",
      "Wall time: 10min 47s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dv['tweet'] = [ tokenizer.decode( [ int(n) for n in t.split('\\t') ] ) for t in tqdm(dv.text_tokens.values) ] \n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "64a21d3a37304c7ea8c5c38324c78719",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=12434838.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "CPU times: user 8min 56s, sys: 5.64 s, total: 9min 2s\n",
      "Wall time: 8min 56s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "23"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dt['tweet'] = [ tokenizer.decode( [ int(n) for n in t.split('\\t') ] ) for t in tqdm(dt.text_tokens.values) ] \n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4min 17s, sys: 7.84 s, total: 4min 25s\n",
      "Wall time: 4min 24s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_tokens</th>\n",
       "      <th>id</th>\n",
       "      <th>tweet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101\\t6417\\t3410\\t3398\\t3184\\t1909\\t56910\\t1683...</td>\n",
       "      <td>0</td>\n",
       "      <td>[CLS] 美 容 室 変 えどきかな 〜 [UNK] [SEP]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101\\t14120\\t131\\t120\\t120\\t188\\t119\\t11170\\t12...</td>\n",
       "      <td>1</td>\n",
       "      <td>[CLS] https://t.co/jbcBe1B5lP [SEP]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101\\t62342\\t10858\\t54439\\t19571\\t22480\\t7831\\t...</td>\n",
       "      <td>2</td>\n",
       "      <td>[CLS] SNCタルコフ 部 門 企 業 説 明 会 始 めます. https://t.c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>101\\t58955\\t10898\\t103305\\t1901\\t16181\\t7168\\t...</td>\n",
       "      <td>3</td>\n",
       "      <td>[CLS] ありがとう 〜 でも 言 いたい 事 を 言 いたい 時 に 言 っているだけな...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>101\\t2435\\t5656\\t2594\\t8279\\t8623\\t1925\\t64126...</td>\n",
       "      <td>4</td>\n",
       "      <td>[CLS] 免 疫 力 雑 魚 すぎるから 一 週 間 絶 対 外 でない [SEP]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         text_tokens  id  \\\n",
       "0  101\\t6417\\t3410\\t3398\\t3184\\t1909\\t56910\\t1683...   0   \n",
       "1  101\\t14120\\t131\\t120\\t120\\t188\\t119\\t11170\\t12...   1   \n",
       "2  101\\t62342\\t10858\\t54439\\t19571\\t22480\\t7831\\t...   2   \n",
       "3  101\\t58955\\t10898\\t103305\\t1901\\t16181\\t7168\\t...   3   \n",
       "4  101\\t2435\\t5656\\t2594\\t8279\\t8623\\t1925\\t64126...   4   \n",
       "\n",
       "                                               tweet  \n",
       "0                  [CLS] 美 容 室 変 えどきかな 〜 [UNK] [SEP]  \n",
       "1                [CLS] https://t.co/jbcBe1B5lP [SEP]  \n",
       "2  [CLS] SNCタルコフ 部 門 企 業 説 明 会 始 めます. https://t.c...  \n",
       "3  [CLS] ありがとう 〜 でも 言 いたい 事 を 言 いたい 時 に 言 っているだけな...  \n",
       "4        [CLS] 免 疫 力 雑 魚 すぎるから 一 週 間 絶 対 外 でない [SEP]  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "df['tweet'] = df['tweet'].apply( lambda x: x.replace('https : / / t. co / ', 'https://t.co/') )\n",
    "df['tweet'] = df['tweet'].apply( lambda x: x.replace('@ ', '@') )\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23.5 s, sys: 344 ms, total: 23.9 s\n",
      "Wall time: 23.9 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_tokens</th>\n",
       "      <th>id</th>\n",
       "      <th>tweet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101\\t47185\\t10157\\t100986\\t10343\\t55422\\t119\\t...</td>\n",
       "      <td>148075238</td>\n",
       "      <td>[CLS] Funky techno Witch. https://t.co/YdfhIt7...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101\\t6006\\t5086\\t1939\\t7418\\t3601\\t6406\\t1913\\...</td>\n",
       "      <td>148075239</td>\n",
       "      <td>[CLS] 空 港 で 財 布 置 き 忘 れたら 偶 然 隣 座 ってた 方 がフォロワー...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101\\t56898\\t137\\t44851\\t10317\\t11490\\t10112\\t1...</td>\n",
       "      <td>148075240</td>\n",
       "      <td>[CLS] RT @LegadoDeKonoha : Eis o verdadeiro si...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>101\\t13497\\t10437\\t94005\\t11161\\t73632\\t11067\\...</td>\n",
       "      <td>148075241</td>\n",
       "      <td>[CLS] Para uma criança pequenina, que verá um ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>101\\t24781\\t10152\\t42041\\t38268\\t10301\\t10798\\...</td>\n",
       "      <td>148075242</td>\n",
       "      <td>[CLS] Why lesbian couples are more likely to d...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         text_tokens         id  \\\n",
       "0  101\\t47185\\t10157\\t100986\\t10343\\t55422\\t119\\t...  148075238   \n",
       "1  101\\t6006\\t5086\\t1939\\t7418\\t3601\\t6406\\t1913\\...  148075239   \n",
       "2  101\\t56898\\t137\\t44851\\t10317\\t11490\\t10112\\t1...  148075240   \n",
       "3  101\\t13497\\t10437\\t94005\\t11161\\t73632\\t11067\\...  148075241   \n",
       "4  101\\t24781\\t10152\\t42041\\t38268\\t10301\\t10798\\...  148075242   \n",
       "\n",
       "                                               tweet  \n",
       "0  [CLS] Funky techno Witch. https://t.co/YdfhIt7...  \n",
       "1  [CLS] 空 港 で 財 布 置 き 忘 れたら 偶 然 隣 座 ってた 方 がフォロワー...  \n",
       "2  [CLS] RT @LegadoDeKonoha : Eis o verdadeiro si...  \n",
       "3  [CLS] Para uma criança pequenina, que verá um ...  \n",
       "4  [CLS] Why lesbian couples are more likely to d...  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dv['tweet'] = dv['tweet'].apply( lambda x: x.replace('https : / / t. co / ', 'https://t.co/') )\n",
    "dv['tweet'] = dv['tweet'].apply( lambda x: x.replace('@ ', '@') )\n",
    "dv.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 21 s, sys: 276 ms, total: 21.2 s\n",
      "Wall time: 21.2 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_tokens</th>\n",
       "      <th>id</th>\n",
       "      <th>tweet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101\\t3100\\t5477\\t3028\\t4348\\t1924\\t111806\\t186...</td>\n",
       "      <td>163202922</td>\n",
       "      <td>[CLS] 埼 玉 土 日 じゃん ！. 行 けないこともないか... ？ [UNK]. え...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101\\t56898\\t137\\t36110\\t10400\\t168\\t64062\\t131...</td>\n",
       "      <td>163202923</td>\n",
       "      <td>[CLS] RT @meanie _ ark : いいね ・ RTでそれぞれポイントになるみ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101\\t56898\\t137\\t179\\t36816\\t10775\\t40546\\t513...</td>\n",
       "      <td>163202924</td>\n",
       "      <td>[CLS] RT @kwonjiyongbabe : this vip be hanging...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>101\\t56898\\t137\\t22038\\t40663\\t12892\\t45389\\t1...</td>\n",
       "      <td>163202925</td>\n",
       "      <td>[CLS] RT @massudessu13 : 너무좋아 [UNK].. # 박유천 ht...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>101\\t56898\\t137\\t11699\\t10174\\t10738\\t37816\\t1...</td>\n",
       "      <td>163202926</td>\n",
       "      <td>[CLS] RT @AtkArena : Welcome to our new CS : G...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         text_tokens         id  \\\n",
       "0  101\\t3100\\t5477\\t3028\\t4348\\t1924\\t111806\\t186...  163202922   \n",
       "1  101\\t56898\\t137\\t36110\\t10400\\t168\\t64062\\t131...  163202923   \n",
       "2  101\\t56898\\t137\\t179\\t36816\\t10775\\t40546\\t513...  163202924   \n",
       "3  101\\t56898\\t137\\t22038\\t40663\\t12892\\t45389\\t1...  163202925   \n",
       "4  101\\t56898\\t137\\t11699\\t10174\\t10738\\t37816\\t1...  163202926   \n",
       "\n",
       "                                               tweet  \n",
       "0  [CLS] 埼 玉 土 日 じゃん ！. 行 けないこともないか... ？ [UNK]. え...  \n",
       "1  [CLS] RT @meanie _ ark : いいね ・ RTでそれぞれポイントになるみ...  \n",
       "2  [CLS] RT @kwonjiyongbabe : this vip be hanging...  \n",
       "3  [CLS] RT @massudessu13 : 너무좋아 [UNK].. # 박유천 ht...  \n",
       "4  [CLS] RT @AtkArena : Welcome to our new CS : G...  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "dt['tweet'] = dt['tweet'].apply( lambda x: x.replace('https : / / t. co / ', 'https://t.co/') )\n",
    "dt['tweet'] = dt['tweet'].apply( lambda x: x.replace('@ ', '@') )\n",
    "\n",
    "dt.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_parquet( 'train-tweet-1.parquet' )\n",
    "dv.to_parquet( 'test0-tweet-1.parquet' )\n",
    "dt.to_parquet( 'test1-tweet-1.parquet' )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
